#Load packages
# Mathematics and Statistics Packages
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from scipy.stats import f_oneway
from sklearn.feature_selection import f_regression
# Graphical Packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Colab Packages
from google.colab import drive
from google.colab import data_table
# Data Preparation Packages
from sklearn.model_selection import train_test_split
# Linear Regression Packages
from sklearn.linear_model import LinearRegression
# Decision Tree Packages
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.tree import plot_tree
# Model Evaluation Packages
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Cross-Validation Packages
from sklearn.model_selection import cross_validate
# Fixed random seed so every train/test split in this notebook is reproducible.
random_state = 0
#pre defined function
def show_values(axs, orient="v", space=.01):
    """Annotate every bar patch on the given axes with its numeric value.

    Parameters
    ----------
    axs : matplotlib Axes or numpy.ndarray of Axes
        The axes (or grid of axes) whose bar patches should be labelled.
    orient : str
        "v" writes each bar's height just above a vertical bar;
        "h" writes each bar's width just to the right of a horizontal bar.
    space : float
        Horizontal gap between a horizontal bar's end and its label
        (not used for vertical bars).
    """
    def _annotate(axis):
        # Label one axes object; orientation decides which dimension is shown.
        if orient == "v":
            for patch in axis.patches:
                height = patch.get_height()
                label_x = patch.get_x() + patch.get_width() / 2
                # Place the label 1% of the bar height above the bar top.
                label_y = patch.get_y() + height + height * 0.01
                axis.text(label_x, label_y, '{:.1f}'.format(height), ha="center")
        elif orient == "h":
            for patch in axis.patches:
                width = patch.get_width()
                label_x = patch.get_x() + width + float(space)
                # Vertically center the label on the bar.
                label_y = patch.get_y() + patch.get_height() - patch.get_height() * 0.5
                axis.text(label_x, label_y, '{:.1f}'.format(width), ha="left")
    if isinstance(axs, np.ndarray):
        # A grid of axes (e.g. from plt.subplots): annotate each one.
        for _, single_ax in np.ndenumerate(axs):
            _annotate(single_ax)
    else:
        _annotate(axs)
# Mount Google Drive and load the filtered North-America sales dataset.
drive.mount('/content/gdrive')
sales_df = pd.read_csv('/content/gdrive/MyDrive/Data_Sets/NA_sales_filtered.csv', encoding = "ISO-8859-1")
# Initial inspection: dtypes/row counts, summary statistics, and a sample of rows.
sales_df.info()
sales_df.describe(include='all')
sales_df.head(20)
# Explore the Name column: frequency of each game title.
name_counts = sales_df['Name'].value_counts()
name_counts
# Plot the Name counts as a bar chart.
# NOTE(review): one bar per unique title — likely unreadable if titles are mostly unique; confirm intent.
name_counts_df = name_counts.rename_axis('name').reset_index(name='counts')
fig = plt.figure(figsize=(10,5))
ax = sns.barplot(x='name',y='counts',data=name_counts_df)
ticks = ax.set_xticklabels(ax.get_xticklabels(),rotation=0,size=12)
show_values(ax)
# Explore the Platform column: number of games per platform.
platform_counts = sales_df['Platform'].value_counts()
platform_counts
# Plot the Platform counts.
platform_counts_df = platform_counts.rename_axis('platform').reset_index(name='counts')
fig = plt.figure(figsize=(10,5))
ax = sns.barplot(x='platform',y='counts',data=platform_counts_df)
ticks = ax.set_xticklabels(ax.get_xticklabels(),rotation=0,size=12)
show_values(ax)
# Explore the Genre column: number of games per genre.
genre_counts = sales_df['Genre'].value_counts()
genre_counts
# Plot the Genre counts (labels rotated 25 degrees for readability).
genre_counts_df = genre_counts.rename_axis('genre').reset_index(name='counts')
fig = plt.figure(figsize=(10,5))
ax = sns.barplot(x='genre',y='counts',data=genre_counts_df)
ticks = ax.set_xticklabels(ax.get_xticklabels(),rotation=25,size=12)
show_values(ax)
# Distribution of Critic_Score (15 bins).
fig = plt.figure(figsize=(10,5))
ax = sns.histplot(data=sales_df['Critic_Score'],bins=15)
show_values(ax)
# Distribution of User_Score (15 bins).
fig = plt.figure(figsize=(10,5))
ax = sns.histplot(data=sales_df['User_Score'],bins=15)
show_values(ax)
The Name column generally has no correlation with sales, and keeping it would cause problems for our model. Moreover, the Name column merely serves as an index, which cannot be generalized to other datasets.
# Remove the Name column: it is a per-row identifier, not a predictive feature.
sales_df = sales_df.drop(columns = "Name")
sales_df.info()
# Per-group summary statistics of the float columns for each categorical feature.
# Platform
sales_df.groupby(['Platform']).describe(include='float64')
# Genre
sales_df.groupby(['Genre']).describe(include='float64')
# Rating
sales_df.groupby(['Rating']).describe(include='float64')
# Critic_Score: ensure float dtype before numeric filtering.
sales_df['Critic_Score'] = sales_df['Critic_Score'].astype('float64')
# Eliminate outliers of Critic_Score with the 1.5*IQR rule:
# keep rows strictly inside (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
q1_critic_score = sales_df['Critic_Score'].quantile(0.25)
q3_critic_score = sales_df['Critic_Score'].quantile(0.75)
iqr_critic_score = q3_critic_score - q1_critic_score
sales_df = sales_df.loc[ (sales_df['Critic_Score'] > q1_critic_score - 1.5 * iqr_critic_score) & (sales_df['Critic_Score'] < q3_critic_score + 1.5 * iqr_critic_score) ]
# Critic_Count: ensure float dtype.
sales_df['Critic_Count'] = sales_df['Critic_Count'].astype('float64')
# Eliminate outliers of Critic_Count (same 1.5*IQR rule; quantiles are
# recomputed on the already-trimmed frame, so filter order matters).
q1_critic_count = sales_df['Critic_Count'].quantile(0.25)
q3_critic_count = sales_df['Critic_Count'].quantile(0.75)
iqr_critic_count = q3_critic_count - q1_critic_count
sales_df = sales_df.loc[(sales_df['Critic_Count'] > q1_critic_count - 1.5 * iqr_critic_count) & (sales_df['Critic_Count'] < q3_critic_count + 1.5 * iqr_critic_count) ]
# User_Score: ensure float dtype.
sales_df['User_Score'] = sales_df['User_Score'].astype('float64')
# Eliminate outliers of User_Score.
q1_user_score = sales_df['User_Score'].quantile(0.25)
q3_user_score = sales_df['User_Score'].quantile(0.75)
iqr_user_score = q3_user_score - q1_user_score
sales_df = sales_df.loc[(sales_df['User_Score'] > q1_user_score -1.5 * iqr_user_score) & (sales_df['User_Score'] < q3_user_score + 1.5 * iqr_user_score)]
# User_Count: ensure float dtype.
sales_df['User_Count'] = sales_df['User_Count'].astype('float64')
# Eliminate outliers of User_Count.
q1_user_count = sales_df['User_Count'].quantile(0.25)
q3_user_count = sales_df['User_Count'].quantile(0.75)
iqr_user_count = q3_user_count - q1_user_count
sales_df = sales_df.loc[(sales_df['User_Count'] > q1_user_count -1.5 * iqr_user_count) & (sales_df['User_Count'] < q3_user_count + 1.5 * iqr_user_count)]
# Eliminate outliers of the target variable NA_Sales.
q1_sales = sales_df['NA_Sales'].quantile(0.25)
q3_sales = sales_df['NA_Sales'].quantile(0.75)
iqr_sales = q3_sales - q1_sales
sales_df = sales_df.loc[(sales_df['NA_Sales'] > q1_sales -1.5 * iqr_sales) & (sales_df['NA_Sales'] < q3_sales + 1.5 * iqr_sales)]
# Check how many rows survived the trimming.
sales_df.info()
# Distribution of NA_Sales after outlier removal (probability scale).
fig = plt.figure(figsize=(10,5))
ax = sns.histplot(data=sales_df['NA_Sales'], bins = 15, stat='probability')
show_values(ax)
# Pairwise scatter/density plots of the numeric columns, colored by Platform.
sns.pairplot(data = sales_df, hue = 'Platform')
# Same pair plot, colored by Genre.
sns.pairplot(data = sales_df, hue = 'Genre')
# Same pair plot, colored by Rating.
sns.pairplot(data = sales_df, hue = 'Rating')
# NOTE(review): NA_Sales is continuous, so using it as hue creates one legend
# entry per distinct value — confirm this plot is intended.
sns.pairplot(data = sales_df, hue = 'NA_Sales')
# Convert all remaining object columns to pandas 'category' dtype.
sales_df[sales_df.select_dtypes(include='object').columns] = sales_df[sales_df.select_dtypes(include='object').columns].astype('category')
sales_df.info()
# One-hot encode the categorical features so everything is numeric.
sales_onehot_df = pd.get_dummies(sales_df,columns =['Platform', 'Genre', 'Rating'])
sales_onehot_df.shape
# Pairwise Pearson correlation matrix of all columns.
correl_df = sales_onehot_df.corr()
correl_df
# Correlation heatmap.
fig = plt.figure(figsize=(30,15))
ax = sns.heatmap(data=correl_df, annot=True, linecolor='black', linewidths=0.5, vmax=1.0, cmap='hot')
plt.show()
# Smallest absolute correlation per column, sorted ascending — used to find
# the least-correlated variable pair.
correl_df.abs().min().sort_values()
(1) Strongest positive correlation: User_Score and Critic_Score (0.52).
(2) Strongest negative correlation: Rating_T and Rating_E (-0.53).
(3) Least correlation: Genre_Sports and Platform_Wii (0.000058).
(1) The independent variable most likely to affect the results of a regression model: Critic_Score.
(2) Two pairs of collinear variables: User_Score and Critic_Score; Critic_Count and User_Count.
# 70/30 train/test split with the fixed seed for reproducibility.
train_df, test_df = train_test_split(sales_onehot_df, test_size=0.3, random_state=random_state)
# Training dataset overview
train_df.info()
train_df.describe(include='all')
# Testing dataset overview
test_df.info()
test_df.describe(include='all')
# Training-set NA_Sales distribution (original comment said "testing set",
# but this cell plots train_df).
fig = plt.figure(figsize=(10,5))
ax = sns.histplot(data=train_df['NA_Sales'], bins = 15, stat='probability')
show_values(ax)
# Testing-set NA_Sales distribution (original comment said "training set",
# but this cell plots test_df).
fig = plt.figure(figsize=(10,5))
ax = sns.histplot(data=test_df['NA_Sales'], bins = 15, stat='probability')
show_values(ax)
The distributions of NA_Sales in the training and testing sets are the same as the distribution of NA_Sales in the entire dataset, because the split is random.
# Collect all column names from both splits (the set union removes duplicates;
# both splits share the same columns, so this is simply the column list).
variable_list = list(set(list(train_df.columns)+list(test_df.columns)))
print("All Variables: "+str(variable_list))
# Separate the dependent variable (NA_Sales) from the independent variables.
sales_index = variable_list.index('NA_Sales')
dependent_variable_list = [variable_list.pop(sales_index)]
print("Dependent Variable: "+str(dependent_variable_list))
# Sort so the feature column order is deterministic (set order is not).
independent_variable_list = sorted(variable_list)
print("Independent Variables: "+str(independent_variable_list))
# Build NumPy arrays: X = feature matrix, y = flattened NA_Sales vector.
X_train = train_df[independent_variable_list].to_numpy()
print("Training Independent Data Shape: "+str(X_train.shape))
y_train_ground_truth = train_df[dependent_variable_list].to_numpy().reshape(-1)
print("Training Dependent Data Shape: "+str(y_train_ground_truth.shape))
print("Training Data Set Shape: "+str(train_df.shape))
print("\n")
X_test = test_df[independent_variable_list].to_numpy()
print("Testing Independent Data Shape: "+str(X_test.shape))
y_test_ground_truth = test_df[dependent_variable_list].to_numpy().reshape(-1)
print("Testing Dependent Data Shape: "+str(y_test_ground_truth.shape))
print("Testing Data Set Shape: "+str(test_df.shape))
print("\n")
# Arrays for the full (unsplit) dataset, used later for plotting axis ranges.
X = sales_onehot_df[independent_variable_list].to_numpy()
print("Original Independent Data Shape: "+str(X.shape))
y_ground_truth = sales_onehot_df[dependent_variable_list].to_numpy().reshape(-1)
print("Original Dependent Data Shape: "+str(y_ground_truth.shape))
print("Original Data Set Shape: "+str(sales_onehot_df.shape))
#A: Train a simple one-feature regression model on Critic_Count.
# Column 0 of X is 'Critic_Count' — the first of the alphabetically sorted features.
lin_reg_critic_count = LinearRegression().fit(X_train[:,0].reshape(-1,1), y_train_ground_truth)
#B: Predict NA_Sales for the held-out test rows.
critic_count_pred = lin_reg_critic_count.predict(X_test[:,0].reshape(-1,1))
#C: Metrics
#Slope Coefficient
print(str(lin_reg_critic_count.coef_[0].round(5)))
#Intercept Coefficient
print(str(lin_reg_critic_count.intercept_.round(5)))
#Training Coefficient of Determination (R-squared on the training split)
print(str(lin_reg_critic_count.score(X_train[:,0].reshape(-1,1),y_train_ground_truth).round(5)))
#MAE on the test split
print(str(mean_absolute_error(y_test_ground_truth, critic_count_pred).round(5)))
#MSE on the test split
print(str(mean_squared_error(y_test_ground_truth, critic_count_pred).round(5)))
#Testing Coefficient of Determination
print(str(r2_score(y_test_ground_truth,critic_count_pred).round(5)))
#D: Plot the test points and the fitted regression line over the
# full range of Critic_Count in the complete dataset.
testing_instances = critic_count_pred.shape[0]
critic_count_data = np.zeros(shape=(testing_instances,2))
critic_count_data[:,0] = X_test[:,0]
critic_count_data[:,1] = y_test_ground_truth
critic_count_xvalues = np.arange(sales_onehot_df['Critic_Count'].min(),sales_onehot_df['Critic_Count'].max(),.001)
critic_count_model_line = lin_reg_critic_count.coef_[0] * critic_count_xvalues + lin_reg_critic_count.intercept_
plt.scatter(critic_count_data[:,0],critic_count_data[:,1])
plt.plot(critic_count_xvalues,critic_count_model_line,c= "red",linestyle='-')
plt.xlabel("Critic_Count")
plt.ylabel("NA_Sales")
plt.show()
#E: p-value of the slope, from a linear regression on the TEST split.
slope, intercept, r_value, p_value, std_err = stats.linregress(X_test[:,0],y_test_ground_truth)
print("p-Value from SciPy: "+str(p_value))
The model is statistically significant
#A: Train a simple one-feature regression model on Critic_Score (column 1 of X).
lin_reg_critic_score = LinearRegression().fit(X_train[:,1].reshape(-1,1), y_train_ground_truth)
#B: Predict NA_Sales for the held-out test rows.
critic_score_pred = lin_reg_critic_score.predict(X_test[:,1].reshape(-1,1))
#C: Metrics
#Slope Coefficient
print(str(lin_reg_critic_score.coef_[0].round(5)))
#Intercept Coefficient
print(str(lin_reg_critic_score.intercept_.round(5)))
#Training Coefficient of Determination
print(str(lin_reg_critic_score.score(X_train[:,1].reshape(-1,1),y_train_ground_truth).round(5)))
#MAE on the test split
print(str(mean_absolute_error(y_test_ground_truth, critic_score_pred).round(5)))
#MSE on the test split
print(str(mean_squared_error(y_test_ground_truth, critic_score_pred).round(5)))
#Testing Coefficient of Determination
print(str(r2_score(y_test_ground_truth,critic_score_pred).round(5)))
#D: Plot the test points and the fitted line over the full Critic_Score range.
testing_instances = critic_score_pred.shape[0]
critic_score_data = np.zeros(shape=(testing_instances,2))
critic_score_data[:,0] = X_test[:,1]
critic_score_data[:,1] = y_test_ground_truth
critic_score_xvalues = np.arange(sales_onehot_df['Critic_Score'].min(),sales_onehot_df['Critic_Score'].max(),.001)
critic_score_model_line = lin_reg_critic_score.coef_[0] * critic_score_xvalues + lin_reg_critic_score.intercept_
plt.scatter(critic_score_data[:,0],critic_score_data[:,1])
plt.plot(critic_score_xvalues,critic_score_model_line,c= "red",linestyle='-')
plt.xlabel("Critic_Score")
plt.ylabel("NA_Sales")
plt.show()
#E: p-value of the slope (linregress on the TEST split).
slope, intercept, r_value, p_value, std_err = stats.linregress(X_test[:,1],y_test_ground_truth)
print("p-Value from SciPy: "+str(p_value))
The model is statistically significant
#A: Train a simple one-feature regression model on User_Count
# (column -2 of X: second-to-last of the alphabetically sorted features).
lin_reg_user_count = LinearRegression().fit(X_train[:,-2].reshape(-1,1), y_train_ground_truth)
#B: Predict NA_Sales for the held-out test rows.
user_count_pred = lin_reg_user_count.predict(X_test[:,-2].reshape(-1,1))
#C: Metrics
#Slope Coefficient
print(str(lin_reg_user_count.coef_[0].round(5)))
#Intercept Coefficient
print(str(lin_reg_user_count.intercept_.round(5)))
#Training Coefficient of Determination
print(str(lin_reg_user_count.score(X_train[:,-2].reshape(-1,1),y_train_ground_truth).round(5)))
#MAE on the test split
print(str(mean_absolute_error(y_test_ground_truth, user_count_pred).round(5)))
#MSE on the test split
print(str(mean_squared_error(y_test_ground_truth, user_count_pred).round(5)))
#Testing Coefficient of Determination
print(str(r2_score(y_test_ground_truth,user_count_pred).round(5)))
#D: Plot the test points and the fitted line over the full User_Count range.
testing_instances = user_count_pred.shape[0]
user_count_data = np.zeros(shape=(testing_instances,2))
user_count_data[:,0] = X_test[:,-2]
user_count_data[:,1] = y_test_ground_truth
user_count_xvalues = np.arange(sales_onehot_df['User_Count'].min(),sales_onehot_df['User_Count'].max(),.001)
user_count_model_line = lin_reg_user_count.coef_[0] * user_count_xvalues + lin_reg_user_count.intercept_
plt.scatter(user_count_data[:,0],user_count_data[:,1])
plt.plot(user_count_xvalues,user_count_model_line,c= "red",linestyle='-')
plt.xlabel("User_Count")
plt.ylabel("NA_Sales")
plt.show()
#E: p-value of the slope (linregress on the TEST split).
slope, intercept, r_value, p_value, std_err = stats.linregress(X_test[:,-2],y_test_ground_truth)
print("p-Value from SciPy: "+str(p_value))
The model is statistically significant
#A: Train a simple one-feature regression model on User_Score
# (column -1 of X: last of the alphabetically sorted features).
lin_reg_user_score = LinearRegression().fit(X_train[:,-1].reshape(-1,1), y_train_ground_truth)
#B: Predict NA_Sales for the held-out test rows.
user_score_pred = lin_reg_user_score.predict(X_test[:,-1].reshape(-1,1))
#C: Metrics
#Slope Coefficient
print(str(lin_reg_user_score.coef_[0].round(5)))
#Intercept Coefficient
print(str(lin_reg_user_score.intercept_.round(5)))
#Training Coefficient of Determination
print(str(lin_reg_user_score.score(X_train[:,-1].reshape(-1,1),y_train_ground_truth).round(5)))
#MAE on the test split
print(str(mean_absolute_error(y_test_ground_truth, user_score_pred).round(5)))
#MSE on the test split
print(str(mean_squared_error(y_test_ground_truth, user_score_pred).round(5)))
#Testing Coefficient of Determination
print(str(r2_score(y_test_ground_truth,user_score_pred).round(5)))
#D: Plot the test points and the fitted line over the full User_Score range.
testing_instances = user_score_pred.shape[0]
user_score_data = np.zeros(shape=(testing_instances,2))
user_score_data[:,0] = X_test[:,-1]
user_score_data[:,1] = y_test_ground_truth
user_score_xvalues = np.arange(sales_onehot_df['User_Score'].min(),sales_onehot_df['User_Score'].max(),.001)
user_score_model_line = lin_reg_user_score.coef_[0] * user_score_xvalues + lin_reg_user_score.intercept_
plt.scatter(user_score_data[:,0],user_score_data[:,1])
plt.plot(user_score_xvalues,user_score_model_line,c= "red",linestyle='-')
plt.xlabel("User_Score")
plt.ylabel("NA_Sales")
plt.show()
#E: p-value of the slope (linregress on the TEST split).
slope, intercept, r_value, p_value, std_err = stats.linregress(X_test[:,-1],y_test_ground_truth)
print("p-Value from SciPy: "+str(p_value))
The model is statistically significant
# Train a multivariate regression model on ALL independent variables.
lin_reg_full = LinearRegression().fit(X_train,y_train_ground_truth)
# Predict on the test split.
lin_reg_full_pred = lin_reg_full.predict(X_test)
# Slope coefficient per feature, shown as a table.
pd.DataFrame({'Feature':independent_variable_list,'Slope Coefficient':lin_reg_full.coef_})
# Other Metrics
#Intercept Coefficient
print("Intercept: "+str(lin_reg_full.intercept_.round(5)))
#Training Coefficient of Determination
print("Training Coefficient of Determination (R-Squared): "+str(lin_reg_full.score(X_train,y_train_ground_truth).round(5)))
# MAE on the test split
print("Mean Absolute Error: "+str(mean_absolute_error(y_test_ground_truth,lin_reg_full_pred).round(5)))
# MSE on the test split
print("Mean Squared Error: "+str(mean_squared_error(y_test_ground_truth,lin_reg_full_pred).round(5)))
#Testing Coefficient of Determination
print("Testing Coefficient of Determination (R-Squared): "+str(r2_score(y_test_ground_truth,lin_reg_full_pred).round(5)))
# Overall model p-value: F-test of an OLS fit (with intercept) on the TEST split.
X2 = sm.add_constant(X_test)
estimator = sm.OLS(y_test_ground_truth,X2)
print("p-Value from StatsModels: "+str(estimator.fit().f_pvalue))
This model is statistically significant
# Locate the one-hot Platform_PS column by name instead of hard-coding an index.
platform_ps_index = independent_variable_list.index('Platform_PS')
#A: Train a simple regression model on the binary Platform_PS indicator.
lin_reg_platform_ps = LinearRegression().fit(X_train[:,platform_ps_index].reshape(-1,1), y_train_ground_truth)
#B: Predict NA_Sales for the held-out test rows.
platform_ps_pred = lin_reg_platform_ps.predict(X_test[:,platform_ps_index].reshape(-1,1))
#C: Metrics
#Slope Coefficient
print(str(lin_reg_platform_ps.coef_[0].round(5)))
#Intercept Coefficient
print(str(lin_reg_platform_ps.intercept_.round(5)))
#Training Coefficient of Determination
print(str(lin_reg_platform_ps.score(X_train[:,platform_ps_index].reshape(-1,1),y_train_ground_truth).round(5)))
#MAE on the test split
print(str(mean_absolute_error(y_test_ground_truth, platform_ps_pred).round(5)))
#MSE on the test split
print(str(mean_squared_error(y_test_ground_truth, platform_ps_pred).round(5)))
#Testing Coefficient of Determination
print(str(r2_score(y_test_ground_truth,platform_ps_pred).round(5)))
#D: Plot the test points and the fitted line (x is effectively 0/1 here).
testing_instances = platform_ps_pred.shape[0]
platform_ps_data = np.zeros(shape=(testing_instances,2))
platform_ps_data[:,0] = X_test[:,platform_ps_index]
platform_ps_data[:,1] = y_test_ground_truth
platform_ps_xvalues = np.arange(sales_onehot_df['Platform_PS'].min(),sales_onehot_df['Platform_PS'].max(),.001)
platform_ps_model_line = lin_reg_platform_ps.coef_[0] * platform_ps_xvalues + lin_reg_platform_ps.intercept_
plt.scatter(platform_ps_data[:,0],platform_ps_data[:,1])
plt.plot(platform_ps_xvalues,platform_ps_model_line,c= "red",linestyle='-')
plt.xlabel("Platform_PS")
plt.ylabel("NA_Sales")
plt.show()
#E: p-value of the slope (linregress on the TEST split).
slope, intercept, r_value, p_value, std_err = stats.linregress(X_test[:,platform_ps_index],y_test_ground_truth)
print("p-Value from SciPy: "+str(p_value))
# Cross-check: p-value from the univariate F-test (ANOVA) via f_regression.
test_statistic, p_value = f_regression(X_test[:,platform_ps_index].reshape(-1,1),y_test_ground_truth)
print(str('{:0.3e}'.format(p_value[0])))
The model is not statistically significant
# Explore the User_Count column: frequency of each distinct count value.
user_counts = sales_df['User_Count'].value_counts()
user_counts
# Plot the User_Count value frequencies.
# NOTE(review): one bar per distinct User_Count value — may be very dense.
user_counts_df = user_counts.rename_axis('User_Count').reset_index(name='counts')
fig = plt.figure(figsize=(10,5))
ax = sns.barplot(x='User_Count',y='counts',data=user_counts_df)
ticks = ax.set_xticklabels(ax.get_xticklabels(),rotation=25,size=12)
show_values(ax)
# Add a log10-transformed User_Count feature to reduce right skew.
# Assumes User_Count > 0 everywhere (log10 of 0 would give -inf) — TODO confirm.
sales_onehot_df['log_user_count'] = np.log10(sales_onehot_df['User_Count'])
sales_onehot_df.info()
# New 70/30 split on the frame that now includes log_user_count.
# Note: this rebinds y_train/y_test_ground_truth used by later cells.
sales_onehot_df_train, sales_onehot_df_test, y_train_ground_truth, y_test_ground_truth = train_test_split(sales_onehot_df.drop('NA_Sales', axis = 1),
sales_onehot_df['NA_Sales'],
train_size=0.7,
random_state = random_state)
#A: Train a simple regression model on the log_user_count feature.
# NOTE(review): column index 37 is hard-coded and assumed to be
# 'log_user_count' (the last column just appended) — confirm, since any
# change to the one-hot columns silently shifts this index.
lin_reg_log_user_count = LinearRegression().fit(sales_onehot_df_train.to_numpy()[:,37].reshape(-1,1), y_train_ground_truth)
#B: Predict NA_Sales for the held-out test rows.
log_user_count_pred = lin_reg_log_user_count.predict(sales_onehot_df_test.to_numpy()[:,37].reshape(-1,1))
#C: Metrics
#Slope Coefficient
print(str(lin_reg_log_user_count.coef_[0].round(5)))
#Intercept Coefficient
print(str(lin_reg_log_user_count.intercept_.round(5)))
#Training Coefficient of Determination
print(str(lin_reg_log_user_count.score(sales_onehot_df_train.to_numpy()[:,37].reshape(-1,1),y_train_ground_truth).round(5)))
#MAE on the test split
print(str(mean_absolute_error(y_test_ground_truth, log_user_count_pred).round(5)))
#MSE on the test split
print(str(mean_squared_error(y_test_ground_truth, log_user_count_pred).round(5)))
#Testing Coefficient of Determination
print(str(r2_score(y_test_ground_truth,log_user_count_pred).round(5)))
#D: Plot the test points and the fitted line over the full log_user_count range.
testing_instances = log_user_count_pred.shape[0]
log_user_count_data = np.zeros(shape=(testing_instances,2))
log_user_count_data[:,0] = sales_onehot_df_test.to_numpy()[:,37]
log_user_count_data[:,1] = y_test_ground_truth
log_user_count_xvalues = np.arange(sales_onehot_df['log_user_count'].min(),sales_onehot_df['log_user_count'].max(),.001)
log_user_count_model_line = lin_reg_log_user_count.coef_[0] * log_user_count_xvalues + lin_reg_log_user_count.intercept_
plt.scatter(log_user_count_data[:,0],log_user_count_data[:,1])
plt.plot(log_user_count_xvalues,log_user_count_model_line,c= "red",linestyle='-')
plt.xlabel("Log_User_Count")
plt.ylabel("NA_Sales")
plt.show()
#E: p-value of the slope (linregress on the TEST split).
slope, intercept, r_value, p_value, std_err = stats.linregress(sales_onehot_df_test.to_numpy()[:,37],y_test_ground_truth)
print("p-Value from SciPy: "+str(p_value))
The model is statistically significant
# Train the full multivariate model on all features including log_user_count.
# NOTE(review): the original User_Count column is still present alongside its
# log transform, so these two features are strongly related — confirm intended.
lin_reg_full_2 = LinearRegression().fit(sales_onehot_df_train,y_train_ground_truth)
# Predict on the test split.
lin_reg_full_pred_2 = lin_reg_full_2.predict(sales_onehot_df_test)
# Rebuild the feature-name list from the current frame (now has log_user_count).
independent_variable_list = list(sales_onehot_df.columns)
independent_variable_list.remove('NA_Sales')
# Slope coefficient per feature, shown as a table.
pd.DataFrame({'Feature':independent_variable_list,'Slope Coefficient':lin_reg_full_2.coef_})
# Other Metrics
#Intercept Coefficient
print('Intercept: '+str(lin_reg_full_2.intercept_.round(5)))
#Training Coefficient of Determination
print("Training Coefficient of Determination (R-Squared): "+str(lin_reg_full_2.score(sales_onehot_df_train,y_train_ground_truth).round(5)))
# MAE on the test split
print("Mean Absolute Error: "+str(mean_absolute_error(y_test_ground_truth,lin_reg_full_pred_2).round(5)))
# MSE on the test split
print("Mean Squared Error: "+str(mean_squared_error(y_test_ground_truth,lin_reg_full_pred_2).round(5)))
#Testing Coefficient of Determination
print("Testing Coefficient of Determination (R-Squared): "+str(r2_score(y_test_ground_truth,lin_reg_full_pred_2).round(5)))
# Overall model p-value: F-test of an OLS fit (with intercept) on the TEST split.
X2 = sm.add_constant(sales_onehot_df_test.to_numpy())
estimator = sm.OLS(y_test_ground_truth,X2)
print("p-Value from StatsModels: "+str(estimator.fit().f_pvalue))
This model is statistically significant
Metrics from the original data set:
Intercept: -0.04913
Training Coefficient of Determination (R-Squared): 0.25275
Mean Absolute Error: 0.11447
Mean Squared Error: 0.02214
Testing Coefficient of Determination (R-Squared): 0.20912
Comparing this nonlinear multivariate model to the fully linear multivariate model, the coefficient of determination increases on both the training and testing sets, while the errors and the p-value decrease.
%%shell
jupyter nbconvert --to html /content/gdrive/MyDrive/Data_Sets/Assignment5_Kim_Sarah.ipynb